/********************************************************************************
Discrimination in Multi-Phase Systems: Evidence from Child Protection

Created on: 12/28/2022
Last Modified on: 2/17/2024

Description: This program generates investigator-level placement and outcome
rates, as well as standard errors for these estimates. Our baseline maltreatment 
outcome is an investigation within six months. This program generates other 
outcomes, such as re-investigation within five, four, three, and two months of 
the focal investigation.

The program takes as an input a child by investigation level dataset spanning 
January 2008 to December 2019, subject to the sample restrictions discussed in 
the paper. Using this child by investigation level dataset, the program then 
estimates investigator-specific rates and standard errors using a linear adjustment 
to account for randomization strata, which we discuss in the paper.  

Note that we have removed the file directory names from this program for 
confidentiality reasons.
********************************************************************************/

**************************
**(0) SETUP
**************************
* Start from a clean session: empty dataset, no paging of output, no leftover
* global macros, no open log, and a fixed random-number seed.
clear
set more off
* _all is the keyword that drops every global macro. Without the underscore
* the command only drops a single macro that happens to be named all.
macro drop _all
capture log close
set seed 02042023

*Set directories
* The directory names were removed for confidentiality (see file header), so
* each global is defined empty here. Fill them in, with a trailing path
* separator, before running: later commands append filenames directly.
global cleand 
global tmpd 
global output 
global data_raw

**************************
**(1) GEN RE-INVESTIGATION OUTCOMES WITHIN DIFFERENT TIME PERIODS 
**************************
* Load the raw allegation-level extract. The filename is quoted so that a
* data_raw directory containing spaces still resolves to one path.
import delimited "${data_raw}allegations.csv", encoding(UTF-8) clear

**Flag allegations that did not result in formal investigations
gen cw_flag_screenedin=screeningdecision=="Accept and Assign for field investigation"
la var cw_flag_screenedin "Allegation was formally investigated"

**Flag children that were not the subject of the investigations
gen cw_flag_child_victim=child_role=="Alleged Victim (AV)"
la var cw_flag_child_victim "Child was alleged victim"
* Keep only alleged victims on allegations that were screened in
keep if cw_flag_child_victim==1 & cw_flag_screenedin==1

**Gen consistent investigation id throughout the sample period 
* The raw string date is kept as cw_date; complaint_date becomes a numeric
* Stata daily date parsed from it.
rename complaint_date cw_date
la var cw_date "CW- Allegation Report Date"
gen complaint_date = date(cw_date,"MDY")
* 20933 is 24apr2017 in Stata daily-date units. Before that date the
* investigation id is taken from investigation_caseid instead of intake_id.
* NOTE(review): presumably the date the ID system changed -- confirm.
replace intake_id = investigation_caseid if complaint_date<20933
rename (intakechildpartyid intake_id) (vicid inv_caseid) 

**Keep and rename relevant variables
keep vicid county_name complaint_date inv_caseid allegationtypedesc cw_flag* relationtypeperptovictim finding catdesc intakeperppartyid cw_date  

rename county_name cw_county
la var cw_county "CW- County of Investigation"

* Use the full variable name rather than the abbreviation allegation, so the
* rename does not depend on variable-name abbreviation being enabled.
rename allegationtypedesc cw_allegation
la var cw_allegation "CW- Allegation Type"

rename relationtypeperptovictim relationship 
la var relationship "CW - Relationship of alleged perp to child"

rename finding cw_sub
la var cw_sub "CW- Substantatiated"

*Gen foster care and substantiation indicators 
* fc is 1 when catdesc is the string 1, then forced to 0 whenever the finding
* is No Preponderance, No Evidence, or blank.
gen fc = catdesc=="1"
replace fc=0 if cw_sub=="No Preponderance" | cw_sub=="No Evidence" | cw_sub==""
gen preponderance = cw_sub=="Preponderance"

* Collapse each indicator to its maximum within child X investigation, so the
* value is 1 if any allegation row in that investigation has it set.
foreach x in fc preponderance {
	gegen tmp_`x' = max(`x'), by(vicid inv_caseid)
	drop `x'
	rename tmp_`x' `x'
}

* Reduce to one row per child X investigation. With force, an arbitrary row is
* retained for variables that still differ within the pair.
gduplicates drop vicid inv_caseid, force 

*For a given focal child X inv observation, did you have another *investigation* within 1 month? 2 months? ... 5 months?
* A month is taken as 30 days. Same-day reports never count as a
* re-investigation, because the window starts at complaint_date+1.
*
* The look-ahead uses the next STRICTLY LATER report date of the same child.
* Looking only at the immediately following row is not enough: when a child has
* two investigation ids on the same date, the first of them would see only its
* same-day neighbour and be coded 0 even if a later investigation falls inside
* the window.
sort vicid complaint_date inv_caseid, stable
* Mark the last row of each child X date group; its successor row, if it
* belongs to the same child, carries the next distinct date.
by vicid complaint_date: gen byte tmp_lastofday = _n==_N
gen double tmp_next_date = complaint_date[_n+1] if tmp_lastofday & vicid==vicid[_n+1]
* Copy that next date to every row sharing the same child and date
by vicid complaint_date: replace tmp_next_date = tmp_next_date[_N]

* Last date against which a follow-up window is checked; windows that extend
* past it are set to missing rather than 0.
local lastdate = date("11/20/2019","MDY")

foreach x in 1 2 3 4 5 {
	* inrange returns 0 when tmp_next_date is missing, that is, when the child
	* has no later investigation
	gen inv`x'm = inrange(tmp_next_date, complaint_date+1, complaint_date+`x'*30)
	* A missing complaint_date also satisfies this condition and yields missing
	replace inv`x'm =. if complaint_date+`x'*30>`lastdate'
	label var inv`x'm "Subject of another investigation within `x' months"
}

drop tmp_lastofday tmp_next_date

* inv* matches inv_caseid as well as inv1m through inv5m
keep vicid inv_caseid inv*

save "${tmpd}inv_shorter_time_horizons.dta", replace 


**************************
**(2) ESTIMATE INVESTIGATOR-LEVEL PLACEMENT AND SUBSEQUENT MALTREATMENT RATES
**************************
* Load the analysis sample and attach the shorter-horizon outcomes built in
* section (1). keep(1 3) retains every analysis-sample row, matched or not.
use "${cleand}analysis_sample_investigators_qje.dta", clear 
cap drop _merge 
merge 1:1 vicid inv_caseid using "${tmpd}inv_shorter_time_horizons.dta", keepus(inv*) keep(1 3)

keep worker_id vicid rotationgroup pre_black fc inv2m inv3m inv4m inv5m count_inv bshare 
rename pre_black black 
* white is 0, not missing, when black is missing, so rows with missing race
* are neither black nor white.
gen white = black==0

* Variable absorbed as the fixed effect in the regressions below
global fixedeffect rotationgroup 
sum $fixedeffect

//get levels of invid
* Consecutive integer id per worker; rows with missing worker_id get a
* missing group id.
egen grpworker_id = group(worker_id)
* Store the list directly through the local() option. Assigning r(levels) with
* an = sign routes it through string-expression evaluation, which can truncate
* a long list on some Stata versions.
levelsof grpworker_id, local(levels)

* Investigator-by-race rates with default standard errors. The work is done
* inside preserve/restore so the child-level data come back unchanged after the
* investigator-level file is written.
preserve

foreach var in fc inv5m inv4m inv3m inv2m {
	* fc is estimated on all rows; every other outcome only on rows with fc==0
	* (children left at home)
	local homeonly
	if "`var'" != "fc" local homeonly "if fc==0"

	* Outcome is defined only for rows flagged black or white
	quietly gen x = `var' if black == 1 | white == 1
	quietly reghdfe x i.grpworker_id i.grpworker_id#i.black `homeonly', absorb($fixedeffect) resid

	* d is the gap between the xbd and xb predictions, that is, the absorbed
	* fixed-effect component; its mean is added back to each estimate below
	predict xbd, xbd
	predict xb, xb
	gen d = xbd - xb
	sum d
	local fe_mean = r(mean)

	gen b_`var' = .
	gen b_se_`var' = .
	gen w_`var' = .
	gen w_se_`var' = .

	* One pass over investigators. A lincom that errors out is skipped, leaving
	* that investigator's estimate and standard error missing.
	quietly {
		foreach w of local levels {
			* White children: constant + investigator effect + mean fixed effect
			capture lincom _cons + _b[`w'.grpworker_id] + `fe_mean'
			if !_rc {
				replace w_`var' = r(estimate) if grpworker_id == `w'
				replace w_se_`var' = r(se) if grpworker_id == `w'
			}

			* Black children: additionally add the investigator X black term
			capture lincom _cons + _b[`w'.grpworker_id] + _b[`w'.grpworker_id#1.black] + `fe_mean'
			if !_rc {
				replace b_`var' = r(estimate) if grpworker_id == `w'
				replace b_se_`var' = r(se) if grpworker_id == `w'
			}
		}
	}

	drop x xbd xb d
}

* Save investigator-level dataset
* Estimates are constant within investigator, so one row per worker is kept
keep b_* w_* worker_id grpworker_id
duplicates drop worker_id, force
drop if worker_id == .
save "${tmpd}inv_rates_shorter_horizons_qje.dta", replace
restore


* Clustered standard errors:
* Re-run the same regressions with two-way clustering on worker_id and vicid,
* and keep only the standard errors of the investigator-by-race combinations.
foreach var in fc inv5m inv4m inv3m inv2m {
	* fc uses all rows; the other outcomes condition on left at home (fc==0)
	local homeonly
	if "`var'" != "fc" local homeonly "if fc==0"

	quietly gen x = `var' if black == 1 | white == 1
	quietly reghdfe x i.grpworker_id i.grpworker_id#i.black `homeonly', absorb($fixedeffect) resid cluster(worker_id vicid)

	* Mean of the absorbed fixed-effect component (xbd minus xb)
	predict xbd, xbd
	predict xb, xb
	gen d = xbd - xb
	sum d
	local fe_mean = r(mean)

	gen b_twse_`var' = .
	gen w_twse_`var' = .

	* Investigators whose lincom fails keep a missing standard error
	quietly {
		foreach w of local levels {
			* White children
			capture lincom _cons + _b[`w'.grpworker_id] + `fe_mean'
			if !_rc {
				replace w_twse_`var' = r(se) if grpworker_id == `w'
			}

			* Black children
			capture lincom _cons + _b[`w'.grpworker_id] + _b[`w'.grpworker_id#1.black] + `fe_mean'
			if !_rc {
				replace b_twse_`var' = r(se) if grpworker_id == `w'
			}
		}
	}

	drop x xbd xb d
}

* Per-investigator counts: all rows, rows with black==1, rows with white==1
cap drop count_inv 
bys worker_id: gen count_inv = _N
bys worker_id: egen count_black = total(black)
bys worker_id : egen count_white = total(white)

* Keep relevant vars
* One row per investigator; rows with missing worker_id are discarded
keep b_twse_* w_twse_* worker_id count_inv count_black count_white grpworker_id
duplicates drop worker_id, force 	
drop if worker_id == .

* Merge clustered standard errors with rates and save dataset
merge 1:1 worker_id using "${tmpd}inv_rates_shorter_horizons_qje.dta", keep(1 3) keepus(b_fc w_fc b_* w_*)

* Convert the estimated fc rates into their complements (one minus the rate)
foreach x in w_fc b_fc {
	replace `x' = 1 - `x'
}

* Share of black rows per investigator, and its count_inv-weighted mean stored
* as a constant in bshare
gen sh_black = count_black/count_inv 
sum sh_black [aw=count_inv] 
gen bshare = r(mean)

rename (w_fc b_fc) (D_w D_b)
* Clamp the linear-model rates to the unit interval. Stata treats missing as
* larger than any number, so the upper clamp must exclude missing values;
* otherwise investigators with no estimate would be assigned a rate of 1.
foreach x in D_w D_b {
	replace `x' = 0 if `x'<0 
	replace `x' = 1 if `x'>1 & !missing(`x')
}

* Squared rates
gen D_w2 = D_w*D_w 
gen D_b2 = D_b*D_b

save "${cleand}inv_rates_se_shorter_horizons_qje.dta", replace